Exploratory Data Analysis - Temperature

1 Import Packages

pacman::p_load(tidyverse, readr, psych, st, stars, tmap, sf,
               ggstatsplot, plotly, ggplot2, ggdist, dplyr, ggiraph)

2 Load the prepared files

Let’s load the RDS files after data preparation.

temperature <- readRDS("data/temperature.rds")
describe(temperature)
         vars    n    mean    sd median trimmed  mad    min    max range  skew
Station*    1 3715   10.00  5.37   10.0   10.12 7.41    1.0   18.0  17.0 -0.12
Region*     2 3715    3.33  1.41    3.0    3.41 1.48    1.0    5.0   4.0 -0.24
Year        3 3715 2010.64 10.07 2013.0 2011.83 8.90 1982.0 2023.0  41.0 -0.96
Month*      4 3715    6.53  3.45    7.0    6.54 4.45    1.0   12.0  11.0 -0.01
Date        5 3715     NaN    NA     NA     NaN   NA    Inf   -Inf  -Inf    NA
MeanTemp    6 3715   27.70  0.85   27.7   27.70 0.89   24.9   30.0   5.1 -0.05
MaxTemp     7 3715   33.85  0.98   33.9   33.86 0.89   30.4   37.9   7.5 -0.11
MinTemp     8 3715   22.62  0.94   22.6   22.63 0.74    0.0   26.2  26.2 -7.42
         kurtosis   se
Station*    -1.39 0.09
Region*     -1.30 0.02
Year        -0.02 0.17
Month*      -1.21 0.06
Date           NA   NA
MeanTemp    -0.40 0.01
MaxTemp      0.28 0.02
MinTemp    178.72 0.02

3 Map of Singapore

mpsz <- st_read(dsn = "data/geospatial", layer = "MPSZ-2019") %>% 
  st_transform(crs=3414)
Reading layer `MPSZ-2019' from data source 
  `C:\Vanessa\SMU\Term 4 - Visual Analytics & Applications\mvheng\Group11_VAP\EDA\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 332 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 103.6057 ymin: 1.158699 xmax: 104.0885 ymax: 1.470775
Geodetic CRS:  WGS 84
glimpse(mpsz)
Rows: 332
Columns: 7
$ SUBZONE_N  <chr> "MARINA EAST", "INSTITUTION HILL", "ROBERTSON QUAY", "JURON…
$ SUBZONE_C  <chr> "MESZ01", "RVSZ05", "SRSZ01", "WISZ01", "MUSZ02", "MPSZ05",…
$ PLN_AREA_N <chr> "MARINA EAST", "RIVER VALLEY", "SINGAPORE RIVER", "WESTERN …
$ PLN_AREA_C <chr> "ME", "RV", "SR", "WI", "MU", "MP", "WI", "WI", "SI", "SI",…
$ REGION_N   <chr> "CENTRAL REGION", "CENTRAL REGION", "CENTRAL REGION", "WEST…
$ REGION_C   <chr> "CR", "CR", "CR", "WR", "CR", "CR", "WR", "WR", "CR", "CR",…
$ geometry   <MULTIPOLYGON [m]> MULTIPOLYGON (((33222.98 29..., MULTIPOLYGON (…

Let’s take a look at the planning areas for the 5 regions.

tmap_mode("view")

tm_shape(mpsz) +
  tm_polygons(col = "REGION_N", palette = "Set2")+
  tm_layout(main.title = "Planning Area",
            main.title.position = "left",
            main.title.size = 1,
            legend.show = FALSE,
            frame = FALSE) +
  tmap_options(check.and.fix = TRUE) +
  tm_view(set.zoom.limits = c(11,12))

4 Temperature analysis

4.1 Analyse temperature using maps

Let’s map the station to the planning area (PA).

Show the code
station_to_PA <- c(
  "Admiralty" = "WOODLANDS",
  "Ang Mo Kio" = "ANG MO KIO",
  "Boon Lay (East)" = "BOON LAY",
  "Changi" = "CHANGI",
  "Choa Chu Kang (South)" = "CHOA CHU KANG",
  "Clementi" = "CLEMENTI",
  "East Coast Parkway" = "BEDOK",
  "Jurong (West)" = "JURONG WEST",
  "Khatib" = "YISHUN",
  "Marina Barrage" = "DOWNTOWN CORE",
  "Newton" = "NEWTON",
  "Pasir Panjang" = "PASIR PANJANG",
  "Paya Lebar" = "PAYA LEBAR",
  "Seletar" = "SELETAR",
  "Sembawang" = "SEMBAWANG",
  "Tai Seng" = "HOUGANG",
  "Tengah" = "TENGAH",
  "Tuas South" = "TUAS"
)

temperature$PA <- station_to_PA[temperature$Station]
temperature <- temperature[, c("PA", setdiff(names(temperature), "PA"))]
head(temperature)
# A tibble: 6 × 9
  PA        Station   Region  Year Month Date       MeanTemp MaxTemp MinTemp
  <chr>     <chr>     <chr>  <dbl> <ord> <date>        <dbl>   <dbl>   <dbl>
1 WOODLANDS Admiralty North   2009 Jan   2009-01-01     26.3    31.9    23.3
2 WOODLANDS Admiralty North   2009 Feb   2009-02-01     26.8    33.4    23  
3 WOODLANDS Admiralty North   2009 Mar   2009-03-01     26.9    34.5    22.2
4 WOODLANDS Admiralty North   2009 Apr   2009-04-01     28.1    35.1    23.7
5 WOODLANDS Admiralty North   2009 May   2009-05-01     28.5    34.7    21.8
6 WOODLANDS Admiralty North   2009 Jun   2009-06-01     28.9    34.7    23.7
temp_map <- temperature %>% 
  group_by(PA, Station, Year) %>% 
  summarise(Annual_Mean_Temperature = 
              mean(MeanTemp, na.rm = TRUE),
            Annual_Maximum_Temperature = 
              max(MaxTemp, na.rm = TRUE),
            Annual_Minimum_Temperature = 
              min(MinTemp, na.rm = TRUE)) %>%
  ungroup()

glimpse(temp_map)
Rows: 323
Columns: 6
$ PA                         <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "…
$ Station                    <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "…
$ Year                       <dbl> 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2…
$ Annual_Mean_Temperature    <dbl> 27.40000, 27.71667, 27.31667, 27.50833, 27.…
$ Annual_Maximum_Temperature <dbl> 34.5, 36.0, 35.4, 34.8, 35.6, 35.0, 34.9, 3…
$ Annual_Minimum_Temperature <dbl> 21.8, 21.7, 21.5, 21.8, 20.0, 21.8, 20.3, 2…
mpsztemp <- left_join(mpsz, temp_map,
                         by = c("PLN_AREA_N" = "PA"))
glimpse(mpsztemp)
Rows: 2,357
Columns: 12
$ SUBZONE_N                  <chr> "MARINA EAST", "INSTITUTION HILL", "ROBERTS…
$ SUBZONE_C                  <chr> "MESZ01", "RVSZ05", "SRSZ01", "WISZ01", "MU…
$ PLN_AREA_N                 <chr> "MARINA EAST", "RIVER VALLEY", "SINGAPORE R…
$ PLN_AREA_C                 <chr> "ME", "RV", "SR", "WI", "MU", "MP", "WI", "…
$ REGION_N                   <chr> "CENTRAL REGION", "CENTRAL REGION", "CENTRA…
$ REGION_C                   <chr> "CR", "CR", "CR", "WR", "CR", "CR", "WR", "…
$ Station                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Year                       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Annual_Mean_Temperature    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Annual_Maximum_Temperature <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Annual_Minimum_Temperature <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ geometry                   <MULTIPOLYGON [m]> MULTIPOLYGON (((33222.98 29...…

Let’s plot the annual mean temperature distribution across Singapore.

tm_shape(mpsztemp) +
  tm_polygons(col = "Annual_Mean_Temperature", 
              palette = "Blues", 
              style = "jenks") +
  tm_view(set.zoom.limits = c(11,12))
Note

It seems like the northern area of Singapore has a cooler mean temperature.

Let’s compare the maximum and minimum temperatures.

tm_shape(mpsztemp) +
  tm_polygons(col = "Annual_Maximum_Temperature", 
              palette = "Blues", 
              style = "jenks") +
  tm_view(set.zoom.limits = c(11,12))
tm_shape(mpsztemp) +
  tm_polygons(col = "Annual_Minimum_Temperature", 
              palette = "Blues", 
              style = "jenks") +
  tm_view(set.zoom.limits = c(11,12))

4.2 Temperature Time Series

4.2.1 Overall - Temperature Time Series

Show the code
gg <- ggplot(temperature, aes(x = Date, y = MeanTemp, 
                         color = factor(Year))) +
    geom_line(linewidth = 0.1) +
    geom_point(aes(text = paste0("Month:", Month, 
                                "<br>MeanTemp:", MeanTemp, "ºC"))) +
    labs(x = "Year", y = "Monthly mean temperature (ºC)", color = "Year",
         title = "Trend of Monthly Mean Temperature from 1981 to 2023", 
         subtitle = "Gentle trend line sloping upwards from 1981",
         caption = "Data from Meteorological Service Singapore website") +
    geom_smooth(method = "lm", 
                se = FALSE, color = "black") +
    theme_minimal() 

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text = 
                        paste0(gg$labels$title, "<br>", "<sup>", 
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")),
           showlegend = FALSE,
    annotations = list(text = gg$labels$caption,
                      xref = "paper", yref = "paper",
                      x = 1000, y = 24,
                      xanchor = "right", yanchor = "top",
                      showarrow = FALSE)) 

4.2.2 Temperature Time Series by station

Show the code
Temp_station <- temperature %>%
  group_by(Station, Year) %>%
  summarise(Temp = mean(MeanTemp, na.rm = TRUE)) %>%
  ungroup()

Temp_station$mean_tooltip <- c(paste0(
  "Year: ", Temp_station$Year,
  "\n Station: ", Temp_station$Station,
  "\n Mean Temp: ", Temp_station$Temp, "°C"))

line <- ggplot(data = Temp_station,
               aes(x = Year,
                   y = Temp,
                   group = Station,
                   color = Station,
                   data_id = Station)) +
  geom_line_interactive(size = 1.2,
                        alpha = 0.4) +
  geom_point_interactive(aes(tooltip = Temp_station$mean_tooltip),
                         fill = "white",
                         size = 1,
                         stroke = 1,
                         shape = 21) +
  theme_classic() +
  ylab("Annual Mean Temperature (°C)") +
  xlab("Year") +
  ggtitle("Annual Average of Mean Temperatures") +
  theme(plot.title = element_text(size = 10),
        plot.subtitle = element_text(size = 8)) 

girafe(ggobj = line, 
       width_svg = 8,
       height_svg = 6 * 0.618,
       options = list(
         opts_hover(css = "stroke-width: 2.5; opacity: 1;"),
         opts_hover_inv(css = "stroke-width: 1;opacity:0.6;")))

4.3 Confidence Interval of Mean Temperature

Show the code
Temp_yr_error <- temperature %>%
  group_by(Year) %>%
  summarise(n = n(), Temp = mean(MeanTemp, na.rm = TRUE), 
            sd = sd(MeanTemp, na.rm = TRUE)) %>%
  mutate(se = sd/sqrt(n-1)) %>% 
  ungroup()

model <- lm(Temp ~ Year, Temp_yr_error)
y_intercept = coef(model)[1] 
slope_coeff = coef(model)[2]
adjust_yintercept = slope_coeff * 1982 + y_intercept

gg <- ggplot(Temp_yr_error) +
       geom_errorbar(aes(x = factor(Year), ymin = Temp - 2.58 * se, 
                      ymax = Temp+2.58*se), 
                      width=0.2, colour="black", 
                      alpha=0.9, size=0.5) +
       geom_point(aes(x = factor(Year), y = Temp, 
             text = paste0("Year:", `Year`, 
                          "<br>Avg. Temp:", round(Temp, digits = 2),
                          "<br>95% CI:[", 
                          round((Temp - 2.58 * se), digits = 2), ",",
                          round((Temp + 2.58 * se), digits = 2),"]")),
             stat="identity", color="darkred", 
             size = 1.5, alpha = 1) +
       geom_abline(slope = round(slope_coeff, 4), 
                   intercept = adjust_yintercept,
                   untf = TRUE,
                   color = "blue",
                   linetype = "dashed")+
       geom_text(aes(x = 11, y = 27.8, colour = "blue",
                     label = paste0("Temp=", 
                                    round(slope_coeff, 4), "* Year ",
                                    round(y_intercept, 4)))) +
       labs (x = "Year", y = "Annual mean temperatures (°C)",
             title = "99% Confidence interval of annual mean temperatures by year",
             subtitle = "From 1982 to 2023",
             caption = "Data from Meteorological Service Singapore website") +
       theme_minimal() + 
       theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
             plot.title = element_text(face = "bold", size = 12))

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text = 
                        paste0(gg$labels$title, "<br>", "<sup>", 
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")),
           showlegend = FALSE)
Note

We can observe that the mean temperature over the years have increased and the confidence intervals have narrowed.

4.4 Temperature across the months

4.4.1 Box plot across the months

Show the code
gg <- ggplot(temperature, 
       aes(x = factor(Month, levels = month.abb), y = MeanTemp)) +
  geom_violin(color = "navy", fill = "lightblue") +
  geom_hline(data = temperature, 
             aes(yintercept = mean(MeanTemp, na.rm = TRUE)),
             linetype = "dashed", size = 1, colour = "brown") +
  geom_text(aes(x = 4.5, y = 27.3, 
                 label = paste0("Mean : ", 
                                round(mean(MeanTemp,na.rm = TRUE),2), "°C")), 
            colour = "brown") +
  stat_summary(fun = mean, geom = "point", 
               shape = 20, size = 3, color = "orange",
               aes(text = paste0("Mean : ", round(after_stat(y), 2), "°C"))) +
  theme_minimal() +
  labs(title = "Monthly mean temperature across each month from 1981 to 2023",
       subtitle = "November to February are cooler as compared to the rest of the year",
        y = "Daily mean Temperatures (°C)",
        x = "Month",
        caption = "Data from Meteorological Service Singapore website")

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text =
                        paste0(gg$labels$title, "<br>", "<sup>",
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")))
Note

We can observe that temperature in November to February are below the average temperature.

4.4.2 Heatmap across the months

Show the code
Temp <- temperature %>% 
        group_by(Year, Month) %>% 
        summarise(MTemp = mean(MeanTemp, na.rm = TRUE))

gg <- ggplot(Temp, aes(factor(Month, levels = month.abb), factor(Year), 
                          fill = MTemp)) + 
    geom_tile(color = "white",
              aes(text = paste0(Year, "-", Month,
                                "<br>Temp:", round(MTemp, 2), "°C"))) + 
    theme_minimal() + 
    scale_fill_gradient(name = "Temperature",
                        low = "sky blue", 
                        high = "dark blue") +
    labs(x = NULL, y = NULL, 
         title = "Mean temperatures by year and month",
         subtitle = "Hotter in more months of 2023 as compared to the other years")

ggplotly(gg, tooltip = "text")
Note

We can observe that temperature in May and June are consistently high across the years.